In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# Load the training data from data/train.csv, reading the Age column as
# float64, then print the first five rows as a quick sanity check.
train = pd.read_csv(
    "data/train.csv",
    dtype={"Age": np.float64},
)
print(train.head(n=5))
In [2]:
# Fill missing ages with the median of the Age column.
train["Age"] = train["Age"].fillna(train["Age"].median())

# Copies of Parch / SibSp under more descriptive column names.
train["ParentsAndChildren"] = train["Parch"]
train["SiblingsAndSpouses"] = train["SibSp"]

# Encode Sex as an integer: male -> 0, female -> 1.
# Series.map builds a fresh column instead of writing integers into a
# string column through a boolean mask, so the result gets an integer dtype
# when every label is known. Values that are not in the table are passed
# through unchanged, which also keeps this cell safe to re-run.
sex_codes = {"male": 0, "female": 1}
train["Sex"] = train["Sex"].map(lambda label: sex_codes.get(label, label))

# Encode Embarked as an integer: S -> 0, C -> 1, Q -> 2.
# Missing ports are filled with "S" before encoding.
embarked_codes = {"S": 0, "C": 1, "Q": 2}
train["Embarked"] = (
    train["Embarked"]
    .fillna("S")
    .map(lambda port: embarked_codes.get(port, port))
)
In [3]:
# Columns kept for exploratory plotting; "Survived" is included so it can be
# used as the hue of a pair plot (currently disabled):
#   sns.pairplot(data, hue="Survived", dropna=True)
#   plt.savefig("1_seaborn_pair_plot.png")
feature_columns = [
    "Fare",
    "Survived",
    "Age",
    "ParentsAndChildren",
    "SiblingsAndSpouses",
    "Pclass",
    "Embarked",
]
data = train[feature_columns]
data.tail()
Out[3]:
In [4]:
# Inspect the distinct raw values of the Cabin column before parsing them.
print(train["Cabin"].unique())
In [5]:
import re
import pandas
def get_cabin_section(cabin):
    """Return the section letters found at the start of a cabin entry.

    Parameters
    ----------
    cabin : str or missing value
        Raw cabin entry, e.g. ``"C85"``. Anything that is not a string
        (for example ``None`` or ``NaN``) is treated as "no section".

    Returns
    -------
    str
        The first run of ASCII letters found in ``cabin``, or ``"N"`` when
        the entry is not a string or contains no letters.
    """
    # re.search raises TypeError on non-string input (e.g. NaN or None),
    # so fall back to the "no section" marker instead of crashing.
    if not isinstance(cabin, str):
        return "N"
    # Use a regular expression to find the first run of letters.
    cabin_search = re.search('([A-Za-z]+)', cabin)
    # If a match exists, extract and return it.
    if cabin_search:
        return cabin_search.group(1)
    return "N"
In [6]:
# Derive the section letters for every passenger's cabin.
cabin_data = []  # NOTE(review): not referenced by any visible cell

# Missing cabins get the placeholder "N0", whose letter part is "N".
filled_cabins = train["Cabin"].fillna("N0")
train["Cabin"] = filled_cabins
cabin_section = filled_cabins.apply(get_cabin_section)
In [8]:
print(cabin_section.head())

# Integer code for each cabin section letter; "N" (the value produced for
# cabins without a letter) maps to 0.
section_mapping = {"A": 1, "B": 2, "C": 3, "D": 4, "E": 5, "F": 6, "G": 7, "T": 8, "N": 0}

# Translate letters to codes with Series.map rather than assigning integers
# into the string Series mask by mask: the result is a new Series with an
# integer dtype when every letter is known. Values that are not keys of
# section_mapping are passed through unchanged, so re-running this cell on
# already-encoded data is a no-op.
cabin_section = cabin_section.map(
    lambda section: section_mapping.get(section, section)
)
In [9]:
# Store the cabin section values on the training frame as a new column,
# then print the first five rows to check the result.
train["CabinSection"] = cabin_section
print(train.head(5))
In [11]:
%matplotlib inline
plt.figure()
sns.pairplot(data=train[["Fare","Survived","Pclass","CabinSection"]],
hue="Survived", dropna=True)
#plt.savefig("1_seaborn_pair_plot.png")
Out[11]:
In [ ]:
In [ ]: